# Handling Missing Values 

In [1]:
# import pandas 
import pandas as pd 

In [2]:
# Load the UFO reports CSV from its short link into a DataFrame
UFO_REPORTS_URL = "http://bit.ly/uforeports"
ufo = pd.read_csv(UFO_REPORTS_URL)

In [3]:
# Peek at the final five rows (n=5 is also tail()'s default)
ufo.tail(n=5)

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
18236,Grant Park,,TRIANGLE,IL,12/31/2000 23:00
18237,Spirit Lake,,DISK,IA,12/31/2000 23:00
18238,Eagle River,,,WI,12/31/2000 23:45
18239,Eagle River,RED,LIGHT,WI,12/31/2000 23:45
18240,Ybor,,OVAL,FL,12/31/2000 23:59


In [4]:
# Boolean mask of missing values: True wherever a cell is NaN.
# isna() and isnull() are aliases of the same method.
ufo.isna()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,False,True,False,False,False
1,False,True,False,False,False
2,False,True,False,False,False
3,False,True,False,False,False
4,False,True,False,False,False
...,...,...,...,...,...
18236,False,True,False,False,False
18237,False,True,False,False,False
18238,False,True,True,False,False
18239,False,False,False,False,False


__Note__ (reading the `isnull()` output)
1. True: the value is missing (NaN)
2. False: the value is not missing

In [5]:
# Inverse mask: True wherever a cell holds a value.
# notna() and notnull() are aliases of the same method.
ufo.notna()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,True,False,True,True,True
1,True,False,True,True,True
2,True,False,True,True,True
3,True,False,True,True,True
4,True,False,True,True,True
...,...,...,...,...,...
18236,True,False,True,True,True
18237,True,False,True,True,True
18238,True,False,False,True,True
18239,True,True,True,True,True


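__Note__
1. axis = 0 (rows / index): aggregate down the rows, giving one result per column 
2. axis = 1 (columns): aggregate across the columns, giving one result per row 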

In [8]:
# Count missing values per column: axis=0 (the default) sums down the rows,
# producing one total for each column
ufo.isna().sum(axis=0)

City                  25
Colors Reported    15359
Shape Reported      2644
State                  0
Time                   0
dtype: int64

In [7]:
# Summing a boolean Series counts its True values (True == 1, False == 0),
# which is why isnull().sum() yields the number of missing entries
bool_flags = pd.Series([True, False, True])
bool_flags.sum()

2

In [9]:
# Boolean-mask filtering: keep only the rows whose City value is missing
ufo.loc[ufo["City"].isna()]

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
21,,,,LA,8/15/1943 0:00
22,,,LIGHT,LA,8/15/1943 0:00
204,,,DISK,CA,7/15/1952 12:30
241,,BLUE,DISK,MT,7/4/1953 14:00
613,,,DISK,NV,7/1/1960 12:00
1877,,YELLOW,CIRCLE,AZ,8/15/1969 1:00
2013,,,,NH,8/1/1970 9:30
2546,,,FIREBALL,OH,10/25/1973 23:30
3123,,RED,TRIANGLE,WV,11/25/1975 23:00
4736,,,SPHERE,CA,6/23/1982 23:00


In [10]:
# Number of missing values in the City column alone
ufo["City"].isna().sum()

25

## Drop Missing Values 

In [11]:
# shape: (number of rows, number of columns) of the dataset before dropping anything
ufo.shape 

(18241, 5)

In [12]:
# Drop every row that contains at least one missing value.
# dropna() returns a new DataFrame (inplace defaults to False), so ufo itself
# is left unchanged; only the shape of the result is shown here.
ufo.dropna(axis=0, how="any").shape

(2486, 5)

In [13]:
# how="all": drop a row only when every one of its values is missing
ufo.dropna(axis=0, how="all").shape

(18241, 5)

In [14]:
# subset + how="any": drop a row when City OR Shape Reported is missing
checked_columns = ["City", "Shape Reported"]
ufo.dropna(subset=checked_columns, how="any").shape

(15576, 5)

In [15]:
# subset + how="all": drop a row only when City AND Shape Reported are both missing
checked_columns = ["City", "Shape Reported"]
ufo.dropna(subset=checked_columns, how="all").shape

(18237, 5)

## Filling Missing Values 

In [19]:
# value counts: dropna=True is the default, so NaN is excluded from the tally
ufo["Shape Reported"].value_counts(dropna=True)

LIGHT        2803
DISK         2122
TRIANGLE     1889
OTHER        1402
CIRCLE       1365
SPHERE       1054
FIREBALL     1039
OVAL          845
CIGAR         617
FORMATION     434
VARIOUS       333
RECTANGLE     303
CYLINDER      294
CHEVRON       248
DIAMOND       234
EGG           197
FLASH         188
TEARDROP      119
CONE           60
CROSS          36
DELTA           7
ROUND           2
CRESCENT        2
PYRAMID         1
HEXAGON         1
DOME            1
FLARE           1
Name: Shape Reported, dtype: int64

In [20]:
# value counts with dropna=False: missing values get their own NaN row
shape_reported = ufo["Shape Reported"]
shape_reported.value_counts(dropna=False)

LIGHT        2803
NaN          2644
DISK         2122
TRIANGLE     1889
OTHER        1402
CIRCLE       1365
SPHERE       1054
FIREBALL     1039
OVAL          845
CIGAR         617
FORMATION     434
VARIOUS       333
RECTANGLE     303
CYLINDER      294
CHEVRON       248
DIAMOND       234
EGG           197
FLASH         188
TEARDROP      119
CONE           60
CROSS          36
DELTA           7
CRESCENT        2
ROUND           2
PYRAMID         1
DOME            1
FLARE           1
HEXAGON         1
Name: Shape Reported, dtype: int64

In [22]:
# fillna(): replace missing shapes with "VARIOUS".
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on ufo["Shape Reported"]: that chained in-place
# call operates on an intermediate Series, raises a FutureWarning in
# pandas 2.x, and leaves ufo unchanged under Copy-on-Write (the default
# behaviour in pandas 3.0).
ufo["Shape Reported"] = ufo["Shape Reported"].fillna(value="VARIOUS")

In [23]:
# Re-count after filling: the former NaN rows are now tallied under VARIOUS
shape_counts = ufo["Shape Reported"].value_counts()
shape_counts

VARIOUS      2977
LIGHT        2803
DISK         2122
TRIANGLE     1889
OTHER        1402
CIRCLE       1365
SPHERE       1054
FIREBALL     1039
OVAL          845
CIGAR         617
FORMATION     434
RECTANGLE     303
CYLINDER      294
CHEVRON       248
DIAMOND       234
EGG           197
FLASH         188
TEARDROP      119
CONE           60
CROSS          36
DELTA           7
CRESCENT        2
ROUND           2
PYRAMID         1
HEXAGON         1
DOME            1
FLARE           1
Name: Shape Reported, dtype: int64